In [1]:
# Widen the notebook container so the wide loan DataFrames render without
# horizontal scrolling.
from IPython.core.display import display, HTML
display(HTML("""<style> .container {width:96% !important;}</style>"""))

from IPython.display import IFrame
In [2]:
# __future__ imports must precede all other statements in a compilation
# unit; putting `division` first is both the portable form and makes the
# intent (true division on Python 2) visible before any numeric code runs.
from __future__ import division

import pandas as pd
import numpy as np

# Offline plotly/cufflinks setup: render interactive charts in-notebook
# without a plotly account or network access.
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
init_notebook_mode()
cf.go_offline()
In [3]:
import sys
# Make the project root importable so the local `utils` package resolves.
sys.path.insert(0,'../')
# NOTE(review): star import — this brings in path_SBA (used below) plus
# anything else utils.paths defines; an explicit
# `from utils.paths import path_SBA` would be clearer.
from utils.paths import *

Introduction to the SBA

(Small Business Administration)

Business case

SBA acts much like an insurance provider to reduce the risk for a bank by taking on some of the risk through guaranteeing a portion of the loan.

  • What is the risk level of SBA?
  • What is the overall loss ratio of SBA?
  • Can we improve it using machine learning?

Experiment:

  1. Study the loss ratio in 2005
  2. Use 2003 and 2004 data to build a machine learning model
  3. Use this model to predict default in 2005

In [4]:
nat = pd.read_csv(path_SBA + 'SBAnational_new.csv', sep = ';', low_memory=False)
In [5]:
nat5 = nat[nat.ApprovalFY.isin([2005])].reset_index(drop = True)

Assign grades based on SBA ratio

Let's assign grades for SBA based on the SBA ratio, e.g. grade 5 for a ratio below 0.2, grade 4 for a ratio between 0.2 and 0.4, and so on.

In [6]:
# Bin the SBA coverage ratio into five grades. pd.cut uses right-closed
# intervals, so grade 5 = (0, 0.2], grade 4 = (0.2, 0.4], ..., grade 1 = (0.8, 1].
# NOTE(review): a ratio of exactly 0 falls outside every bin and would get a
# NaN grade (include_lowest=True would capture it) — confirm no such rows exist.
sba_ratio_th = [0, 0.2, 0.4, 0.6, 0.8, 1]
sba_grades = [5, 4, 3, 2, 1]

nat5['SBA_grades'] = pd.cut(nat5.SBA_ratio, bins = sba_ratio_th, labels = sba_grades)
In [7]:
nat5.SBA_grades.value_counts().sort_index(ascending = False)
Out[7]:
1    17294
2    10931
3    48706
4       23
5        3
Name: SBA_grades, dtype: int64
In [8]:
def default_cat_table(data, cat):
    """Cross-tabulate default outcomes for each level of a category.

    Parameters
    ----------
    data : DataFrame with a binary 'default' column (1 = default, 0 = not).
    cat : str, name of the column to group by.

    Returns
    -------
    DataFrame indexed by the levels of `cat` with columns 'Non-default',
    'Default', 'ALL' (total rows per level) and 'default_rate'
    (Default / ALL; NaN where a level has no defaults).
    """
    # .size() counts rows per (category, default) pair directly; the original
    # count().max(1) trick undercounts when every column is null in a group.
    default_cat = data.groupby([cat, 'default']).size().unstack()
    # value_counts aligns on the category index, giving row totals per level.
    default_cat['ALL'] = data[cat].value_counts()
    default_cat['default_rate'] = (default_cat[1] / default_cat['ALL'])
    default_cat = default_cat.rename(columns = {1: 'Default', 0: 'Non-default'})
    return default_cat

Default rate of each grade (SBA)

In [9]:
default_cat_table(nat5, 'SBA_grades').sort_index(ascending = False)
Out[9]:
default Non-default Default ALL default_rate
SBA_grades
1 13285.0 4009.0 17294 0.231815
2 8999.0 1932.0 10931 0.176745
3 35169.0 13537.0 48706 0.277933
4 22.0 1.0 23 0.043478
5 3.0 NaN 3 NaN
In [10]:
def sba_claim(chgoffamount, sba_app):
    """Claim paid by the SBA on one loan.

    Zero when nothing was charged off; otherwise the charged-off principal,
    capped at the SBA-approved (guaranteed) amount.
    """
    if chgoffamount == 0:
        return 0
    # Pay out the smaller of the charge-off and the guaranteed amount.
    return sba_app if sba_app <= chgoffamount else chgoffamount
    
# Actual SBA claim per loan under the historical coverage terms.
claim = nat5.apply(lambda x: sba_claim(x['ChgOffPrinGr'], x['SBA_Appv']), axis = 1)
nat5['SBA_claim'] = claim

Claim and loss ratio (SBA)

In [11]:
# Overall loss ratio = total claims paid / total amount insured.
# ("lost ratio" in the printed labels is a typo for "loss ratio".)
print 'Overall lost ratio = ', round(nat5.SBA_claim.sum() / nat5.SBA_Appv.sum(), 3)

print 'Lost ratio for each grade: '

# Per-grade summary: gross approvals, case counts, insured amount, claims,
# and the resulting loss ratio.
SBA_grade_df = pd.DataFrame(nat5.groupby('SBA_grades').GrAppv.sum())

SBA_grade_df['no. of cases'] = nat5.SBA_grades.value_counts()
SBA_grade_df['SBA_Appv'] = nat5.groupby('SBA_grades').SBA_Appv.sum()
SBA_grade_df['SBA_claim'] = nat5.groupby('SBA_grades').SBA_claim.sum()
SBA_grade_df['SBA_loss_ratio'] = SBA_grade_df.SBA_claim / SBA_grade_df.SBA_Appv
SBA_grade_df.sort_index(ascending = False)
Overall lost ratio =  0.113
Lost ratio for each grade: 
Out[11]:
GrAppv no. of cases SBA_Appv SBA_claim SBA_loss_ratio
SBA_grades
1 3.182030e+09 17294 3.051684e+09 120698864.0 0.039552
2 5.645468e+09 10931 4.221814e+09 533314200.0 0.126323
3 2.759550e+09 48706 1.383032e+09 324853975.0 0.234885
4 1.586210e+07 23 4.916853e+06 284994.0 0.057963
5 1.907000e+06 3 2.552700e+05 0.0 0.000000
In [12]:
nat5.head()
Out[12]:
LoanNr_ChkDgt Name City State Zip Bank BankState NAICS ApprovalDate ApprovalFY ... suffix Loan_age Previous_loan default_record_dict default_times zip fips BusinessType SBA_grades SBA_claim
0 1075325003 Gilly's, LLC LAKE MARY FL 32746 BANK OF AMERICA NATL ASSOC NC 423920 2004-10-05 2005 ... LLC 0 0 NaN 0 32746.0 12117.0 CORPORATION 3 0.0
1 1075335006 Gerald Christiansen dba Big G MANTI UT 84642 ZIONS FIRST NATIONAL BANK UT 236118 2004-10-05 2005 ... NO SUFFIX 0 0 NaN 0 84642.0 49039.0 INDIVIDUAL 3 0.0
2 1075345009 Steven Wilson Louisville KY 40229 PNC BANK, NATIONAL ASSOCIATION KY 238210 2004-10-05 2005 ... NO SUFFIX 0 0 NaN 0 40229.0 21029.0 INDIVIDUAL 3 0.0
3 1075355001 Duraport Marine And Rail Termi BAYONNE NJ 7002 BANK OF AMERICA NATL ASSOC RI 541611 2004-10-05 2005 ... NO SUFFIX 0 0 NaN 0 7002.0 34017.0 NaN 3 0.0
4 1075365004 Rucinsky's Painting and Decora OSHKOSH WI 54901 ASSOCIATED BANK NATL ASSOC WI 531120 2004-10-05 2005 ... NO SUFFIX 0 0 NaN 0 54901.0 55139.0 CORPORATION 3 0.0

5 rows × 48 columns

Use Machine Learning to grade the companies

  • See notebook - '03 - Experiment - machine learning - model tuning'

Result from machine learning

In [13]:
# Grades predicted by the ML model (built in the model-tuning notebook).
projection = pd.read_csv(path_SBA + 'result_table_proj.csv', sep = ';', low_memory=False)
projection = projection.rename(columns = {'Grade': 'ML_grades'})
# NOTE(review): this assigns by index alignment — it assumes the projection
# rows share the same order/RangeIndex as nat5. A merge on the loan key
# would be safer; confirm the ordering contract with the other notebook.
nat5['ML_grades'] = projection['ML_grades']

Define new SBA ratio with machine learning grades

  • Grade 1 : 100% coverage
  • Grade 2 : 80% coverage
  • Grade 3 : 60% coverage
  • Grade 4 : 40% coverage
  • Grade 5 : 20% coverage
In [14]:
def sba_ratio_ml(grade):
    """Coverage ratio the SBA guarantees for an ML grade (1-5):
    grade 1 -> 100% down to grade 5 -> 20%, in 20-point steps.
    Raises KeyError for any grade outside 1-5.
    """
    coverage_by_grade = {1: 1,
                         2: 0.8,
                         3: 0.6,
                         4: 0.4,
                         5: 0.2}
    return coverage_by_grade[grade]
In [15]:
# Coverage ratio and insured amount implied by the ML grades.
nat5.loc[:, 'ML_SBA_ratio'] = nat5.ML_grades.apply(sba_ratio_ml)
nat5.loc[:, 'ML_SBA_Appv'] = nat5.GrAppv * nat5.ML_SBA_ratio
In [16]:
# claim if using ML grades

# Re-price every claim against the ML-implied guarantee cap; totals by grade.
nat5.loc[:, 'ML_SBA_claim'] = nat5.apply(lambda x: sba_claim(x['ChgOffPrinGr'], x['ML_SBA_Appv']), axis = 1)

nat5.groupby('ML_grades').ML_SBA_claim.sum()
Out[16]:
ML_grades
1     83615000.0
2    164286548.2
3    192030002.4
4    177285631.2
5    106909154.6
Name: ML_SBA_claim, dtype: float64
In [17]:
nat5[['SBA_claim', 'ML_SBA_claim']].sum()
Out[17]:
SBA_claim       979152033.0
ML_SBA_claim    724126336.4
dtype: float64
In [18]:
# Headline numbers for the ML-graded book ("lost ratio" = loss ratio).
print 'Total insured amount = ', nat5.ML_SBA_Appv.sum()
print 'Total claim = ', nat5.ML_SBA_claim.sum()
print 'Overall lost ratio after using ML grades = ', round(nat5.ML_SBA_claim.sum() / nat5.ML_SBA_Appv.sum(), 3)
Total insured amount =  8581364085.8
Total claim =  724126336.4
Overall lost ratio after using ML grades =  0.084
In [19]:
nat5.head()
Out[19]:
LoanNr_ChkDgt Name City State Zip Bank BankState NAICS ApprovalDate ApprovalFY ... default_times zip fips BusinessType SBA_grades SBA_claim ML_grades ML_SBA_ratio ML_SBA_Appv ML_SBA_claim
0 1075325003 Gilly's, LLC LAKE MARY FL 32746 BANK OF AMERICA NATL ASSOC NC 423920 2004-10-05 2005 ... 0 32746.0 12117.0 CORPORATION 3 0.0 3 0.6 6000.0 0.0
1 1075335006 Gerald Christiansen dba Big G MANTI UT 84642 ZIONS FIRST NATIONAL BANK UT 236118 2004-10-05 2005 ... 0 84642.0 49039.0 INDIVIDUAL 3 0.0 2 0.8 28800.0 0.0
2 1075345009 Steven Wilson Louisville KY 40229 PNC BANK, NATIONAL ASSOCIATION KY 238210 2004-10-05 2005 ... 0 40229.0 21029.0 INDIVIDUAL 3 0.0 2 0.8 8000.0 0.0
3 1075355001 Duraport Marine And Rail Termi BAYONNE NJ 7002 BANK OF AMERICA NATL ASSOC RI 541611 2004-10-05 2005 ... 0 7002.0 34017.0 NaN 3 0.0 2 0.8 80000.0 0.0
4 1075365004 Rucinsky's Painting and Decora OSHKOSH WI 54901 ASSOCIATED BANK NATL ASSOC WI 531120 2004-10-05 2005 ... 0 54901.0 55139.0 CORPORATION 3 0.0 1 1.0 225000.0 0.0

5 rows × 52 columns

In [20]:
print 'Lost ratio for each grade: '

# Per-ML-grade summary, mirroring the SBA_grade_df construction above.
ML_SBA_grade_df = pd.DataFrame(nat5.groupby('ML_grades').GrAppv.sum())

ML_SBA_grade_df['no. of cases'] = nat5.ML_grades.value_counts()
ML_SBA_grade_df['ML_SBA_Appv'] = nat5.groupby('ML_grades').ML_SBA_Appv.sum()
ML_SBA_grade_df['ML_SBA_claim'] = nat5.groupby('ML_grades').ML_SBA_claim.sum()
ML_SBA_grade_df['ML_SBA_loss_ratio'] = ML_SBA_grade_df.ML_SBA_claim / ML_SBA_grade_df.ML_SBA_Appv
ML_SBA_grade_df.sort_index(ascending = True)
Lost ratio for each grade: 
Out[20]:
GrAppv no. of cases ML_SBA_Appv ML_SBA_claim ML_SBA_loss_ratio
ML_grades
1 5.047406e+09 11030 5.047406e+09 83615000.0 0.016566
2 2.020042e+09 14667 1.616033e+09 164286548.2 0.101660
3 1.723608e+09 15393 1.034165e+09 192030002.4 0.185686
4 1.605039e+09 15989 6.420155e+08 177285631.2 0.276139
5 1.208722e+09 19878 2.417444e+08 106909154.6 0.442240

Compare between original grades and machine learning grades

In [21]:
# Side-by-side bars: insured amounts and claim totals, original vs ML grading.
nat5[['SBA_Appv', 'ML_SBA_Appv']].sum().iplot(kind = 'bar', title = 'Total insured amount', yTitle = 'USD',
                                             dimensions = [600, 400])

nat5[['SBA_claim', 'ML_SBA_claim']].sum().iplot(kind = 'bar', title = 'Total claim amount', yTitle = 'USD',
                                             dimensions = [600, 400])
In [22]:
# Claim reduction in units of 10 million USD (10e6 == 1e7): the ML grading
# pays ~255 million less in claims (Out value -25.5 * 1e7).
# NOTE(review): the original comment said "Saving 25.5 million" — the
# divisor 10e6 looks like a typo for 1e6 (millions); confirm intent.
nat5[['SBA_claim', 'ML_SBA_claim']].sum().diff() / 10e6
Out[22]:
SBA_claim            NaN
ML_SBA_claim   -25.50257
dtype: float64
In [23]:
# Loss ratio comparison (claims / insured) for each grading scheme.
(np.array(nat5[['SBA_claim', 'ML_SBA_claim']].sum()) / nat5[['SBA_Appv', 'ML_SBA_Appv']].sum()).iplot(kind = 'bar', title = 'Loss ratio', yTitle = 'loss ratio',
                                             dimensions = [600, 400])
In [24]:
# Improve loss ratio by ~ 3% (about 2.9 percentage points: 0.113 -> 0.084)
(np.array(nat5[['SBA_claim', 'ML_SBA_claim']].sum()) / nat5[['SBA_Appv', 'ML_SBA_Appv']].sum()).diff()
Out[24]:
SBA_Appv           NaN
ML_SBA_Appv   -0.02866
dtype: float64
In [25]:
display(SBA_grade_df, ML_SBA_grade_df)
GrAppv no. of cases SBA_Appv SBA_claim SBA_loss_ratio
SBA_grades
5 1.907000e+06 3 2.552700e+05 0.0 0.000000
4 1.586210e+07 23 4.916853e+06 284994.0 0.057963
3 2.759550e+09 48706 1.383032e+09 324853975.0 0.234885
2 5.645468e+09 10931 4.221814e+09 533314200.0 0.126323
1 3.182030e+09 17294 3.051684e+09 120698864.0 0.039552
GrAppv no. of cases ML_SBA_Appv ML_SBA_claim ML_SBA_loss_ratio
ML_grades
1 5.047406e+09 11030 5.047406e+09 83615000.0 0.016566
2 2.020042e+09 14667 1.616033e+09 164286548.2 0.101660
3 1.723608e+09 15393 1.034165e+09 192030002.4 0.185686
4 1.605039e+09 15989 6.420155e+08 177285631.2 0.276139
5 1.208722e+09 19878 2.417444e+08 106909154.6 0.442240
In [26]:
def field_comparsion(var, SBA = SBA_grade_df, ML = ML_SBA_grade_df):
    """Line up one metric from the SBA and ML grade tables, indexed by grade.

    `var` is the ML-side column name (e.g. 'ML_SBA_claim'); the SBA-side
    column is derived by stripping the 'ML_' prefix. Returns a frame with
    columns 'SBA' and 'ML'. (The misspelled name — 'comparsion' — is kept
    for compatibility with existing call sites.)
    """
    base = var.replace('ML_', '')
    # SBA side: sort descending under the grades' categorical ordering so the
    # rows line up 1..5 before merging.
    sba_side = (pd.DataFrame(SBA[base])
                .reset_index()
                .sort_values('SBA_grades', ascending = False)
                .rename(columns = {'SBA_grades': 'Grades', base: 'SBA'})
                .reset_index(drop = True))
    ml_side = pd.DataFrame(ML[var]).reset_index().rename(
        columns = {'ML_grades': 'Grades', var: 'ML'})
    return pd.merge(sba_side, ml_side, on = 'Grades').set_index('Grades')
In [27]:
field_comparsion('no. of cases').iplot(kind = 'bar', title = 'no. of cases')
In [31]:
field_comparsion('ML_SBA_loss_ratio').iplot(kind = 'bar', title = 'Loss ratio')
In [33]:
field_comparsion('ML_SBA_claim').iplot(kind = 'bar', title = 'Claims')